Libraries

library(stringr)
library(DESeq2)
library(vsn)
library(ggplot2)
library(apeglm)
library(dplyr)
library(magrittr)
library(pheatmap)
library(EnhancedVolcano)
library(viridis)
library(biomaRt)
library(PCAtools)
library(UpSetR)
library(edgeR)
library(scopetools)

Functions

Result Table & HeatMap Table Generation

Result Table

restable <- function(min_fc = 2, max_p = 0.05, n_top_genes = 20, deseq, cont, exclude_gene = ""){
  MIN_L2FC=log2(min_fc)
  res_table <- results(deseq, independentFiltering = TRUE, contrast = cont) %>% as.data.frame(.) %>% dplyr::select(everything()) %>%
    filter(log2FoldChange >= min_fc | log2FoldChange <= -min_fc ) %>%
    filter(padj <= max_p) %>%
    arrange(padj)
  if(exclude_gene != ""){
    res_table <- res_table[-which(rownames(res_table) %in% exclude_gene),] %>% arrange(padj)
  }
  print(res_table[1:n_top_genes, ])
  return(res_table)
}

Heatmap Table

#include a exclude genes so that some background genes might be excluded; select top = 40 based on p values
heattable <- function(deseq, res_table, select_string = "^(ERV|HERV|LTR|MER|HML).+",min_fc = 2, max_p = 0.05, select_genes = 40, exclude_genes = ""){
  #rlog scaled
  deseq.vst <- varianceStabilizingTransformation(deseq, blind = TRUE)
  #select padj
  seg.herv <- subset(res_table, padj < max_p & (log2FoldChange > log2(min_fc) | log2FoldChange < -log2(min_fc)))
  #Select the HREVs/Genes
  herv_names  <- str_extract(rownames(seg.herv), select_string)
  seg.herv.1 <- seg.herv[(rownames(seg.herv) %in%  herv_names),]
  seg.herv.1 <- seg.herv.1[!(rownames(seg.herv.1) %in% exclude_genes),]
  if(nrow(seg.herv.1) < select_genes){
    print(paste0("Only ", nrow(seg.herv.1), "rows significant after exclusion. Keeping all."))
    vst.herv <- deseq.vst[rownames(seg.herv.1),] %>% assay
  }else{
    seg.herv.1 <- arrange(seg.herv.1, padj)
    vst.herv <- deseq.vst[rownames(seg.herv.1[1:select_genes,]),] %>% assay
  }
  return(vst.herv)
}

Load Data

#gene annotation
load("/Users/phoebefei/Library/CloudStorage/OneDrive-med.cornell.edu/WCM/Metastatic UM Prj/primary_UM/analysis2/03-gene_data.Rdata")
#genes.annot
load("/Users/phoebefei/Library/CloudStorage/OneDrive-med.cornell.edu/WCM/Metastatic UM Prj/Rdatas/count_data.RData")
load("/Users/phoebefei/Library/CloudStorage/OneDrive-med.cornell.edu/WCM/Metastatic UM Prj/Rdatas/metadata.RData")

Interested HERVs

herv_profile <- c('ERVLE_17p11.2c','LTR46_Xq11.1','HERVH_6q24.1b','HERVH_3q21.3a','MER4B_3q21.2','HERVH_10q23.32','HML3_19q13.2','HERVI_9q22.1','MER4_1p21.3','HML3_3q21.2','ERVLE_5q13.2a','HERV3_19q13.42a','HERVE_Xp11.23','ERV316A3_6p21.33c','MER101_19q13.2c','ERVLE_2p13.2a','MER4B_12q22')

Take out the pre-treatment subgroup for mskcc, non-cell-line group for nilsson

meta_sub <- mskcc_meta[which(mskcc_meta$Treatment_stage == "Pre-tx"),] %>% arrange(RECIST_pre_tx)
nilson_sub <- nilsson_meta[which(nilsson_meta$Cell_line == "Liver metastasis" | nilsson_meta$Cell_line == "Subcutaneous metastasis"),]

Combine the metadata

inter_col <- intersect(intersect(colnames(meta_sub), colnames(nilson_sub)), colnames(TCGA_meta))
pm_meta <- rbind(meta_sub[,inter_col], nilson_sub[,inter_col], TCGA_meta[,inter_col]) %>% arrange(Cluster)

Combine HERV+Gene

all_bind <- rbind(all_genes, all_hervs[,colnames(all_genes)])

DESeq Objects

HERV+Gene in All samples PCA Plotting

DESeq.all <- DESeqDataSetFromMatrix(countData = as.matrix(all_bind[,rownames(metadata_all)]),
                                   colData = metadata_all,
                                   design = ~ Batch) 
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Mean SD Plot + PCA

par(mar=c(11, 8.1, 4.1, 2.1))
#size factor
DESeq.all <- estimateSizeFactors(DESeq.all)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
#normalize & log-sized
DESeq.all_norm <- counts(DESeq.all, normalized = TRUE)
boxplot(log2(DESeq.all_norm+1), notch=FALSE, main = "Size-factor-normalized read counts", ylab="log2(read counts)", cex = .6, las = 2)

assay(DESeq.all,"log.norm.counts") <- log2(DESeq.all_norm+1)

#SD and PCA plot
msd.all <- vsn::meanSdPlot(DESeq.all@assays@data@listData$log.norm.counts, ranks=FALSE, plot = FALSE)
msd.all$gg + ggtitle("Sequencing depth normalized log2(read counts)") + ylab("standard deviation")

DESeq.all$Age <- as.numeric(DESeq.all$Age)
#PCA
DESeq.all.trans <- DESeq(DESeq.all)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 848 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
all.vst <- assay(vst(DESeq.all.trans))
all.pca <- pca(all.vst, metadata = colData(DESeq.all), removeVar = 0.1)
## -- removing the lower 10% of variables based on variance

PCA Visualize

screeplot(all.pca, axisLabSize = 18, titleLabSize = 22) + ggtitle("SCREE Plot for Gene + HERVs")
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

biplot(all.pca, lab = NULL, colby = "Batch", legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by Batch")

biplot(all.pca, lab = NULL, colby = "toMet", legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by Development of Metastasis")

biplot(all.pca, lab = NULL, colby = "Age", legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by Age")

biplot(all.pca, lab = NULL, colby = "BAP1",legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by BAP1 Status")

biplot(all.pca, lab = NULL, colby = "BAP1_type",legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by BAP1 Type")

biplot(all.pca, lab = NULL, colby = "Sex",legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by Sex")

biplot(all.pca, lab = all.pca$metadata$Sample, colby = "Cell_line", legendPosition = 'right') + ggtitle("PCA, Gene+HERV\nColor by Cell Line")
## Warning: ggrepel: 118 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#ggsave("Gene_HERV_all.png")

Correct on the batch: compare both mets data

batch_cols <- intersect(colnames(nilson_sub),colnames(meta_sub))
batch_meta <- rbind(nilson_sub[,batch_cols], meta_sub[,batch_cols])
#create the object
DESeq.herv.batch <- DESeqDataSetFromMatrix(countData = as.matrix(mets_hervs[,rownames(batch_meta)]),
                                   colData = batch_meta,
                                   design = ~ Batch) 
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

Process count info

#size factor
DESeq.herv.batch <- estimateSizeFactors(DESeq.herv.batch)
#normalize & log-sized
DESeq.herv.batch_norm <- counts(DESeq.herv.batch, normalized = TRUE)
boxplot(log2(DESeq.herv.batch_norm+1), notch=FALSE, main = "Size-factor-normalized read counts", ylab="log2(read counts)", cex = .6, las = 2)

assay(DESeq.herv.batch,"log.norm.counts") <- log2(DESeq.herv.batch_norm+1)
#SD and PCA plot
msd.herv.batch <- vsn::meanSdPlot(DESeq.herv.batch@assays@data@listData$log.norm.counts, ranks=FALSE, plot = FALSE)
msd.herv.batch$gg + ggtitle("Sequencing depth normalized log2(read counts)") + ylab("standard deviation")

DESeq.herv.batch.trans <- DESeq(DESeq.herv.batch)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 784 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
herv.batch.vst <- assay(varianceStabilizingTransformation(DESeq.herv.batch.trans))
herv.batch.pca <- pca(herv.batch.vst, metadata = colData(DESeq.herv.batch.trans), removeVar = 0.1)
## -- removing the lower 10% of variables based on variance

PCA Visualize

screeplot(herv.batch.pca, axisLabSize = 18, titleLabSize = 22) + ggtitle("SCREE Plot for HERVs, 2 Metastatic Samples")
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

biplot(herv.batch.pca, lab = NULL, colby = "Batch",legendPosition = 'right') + ggtitle("PCA, HERVs 2 Metastatic Samples\nColor by Batch")

biplot(herv.batch.pca, lab = NULL, colby = "BAP1", legendPosition = 'right') + ggtitle("PCA, HERVs 2 Metastatic Samples\nColor by BAP1 Status")

biplot(herv.batch.pca, lab = NULL, colby = "BAP1_type",legendPosition = 'right') + ggtitle("PCA, HERVs 2 Metastatic Samples\nColor by BAP1 Mutation Type")

biplot(herv.batch.pca, lab = NULL, colby = "Sex", legendPosition = 'right') + ggtitle("PCA, HERVs 2 Metastatic Samples\nColor by Sex")

biplot(herv.batch.pca, lab = herv.batch.pca$metadata$Sample, colby = "Cell_line", legendPosition = 'right') + ggtitle("PCA, HERVs 2 Metastatic Samples\nColor by Sample Cell Line")
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#ggsave("PCA Met Batches.png")
#Run the DESeq with LRT method
DESeq.herv.batch <- DESeq(DESeq.herv.batch, test = "LRT", reduced = ~1)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 828 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

DE table on HERV

#resultsNames(DESeq.herv.batch)
#[1] "Intercept"              "Batch_Nilsson_vs_MSKCC"
herv_res_batch <- restable(min_fc = 2, max_p = 0.01, n_top_genes = 20, DESeq.herv.batch)
##                      baseMean log2FoldChange     lfcSE      stat       pvalue
## L1FLnI_3p22.1e      36.779406       9.231647 0.7432976 208.14527 3.487584e-47
## L1FLnI_5q15xa       14.958638      -6.148193 0.5897376 129.88084 4.351317e-30
## L1FLnI_9p22.1c       5.970992      -3.592273 0.3830545 112.58449 2.660810e-26
## L1FLnI_6q16.1c      14.321941       7.552989 0.8040451  87.90588 6.864430e-21
## L1FLnI_7q22.3h       2.717612      -3.955697 0.5004883  83.85407 5.326795e-20
## HERVH_16p13.2e      63.295987      -2.293550 0.3055782  66.05055 4.395041e-16
## L1FLnI_11p15.4m   1814.397493      -5.766648 0.7530934  63.89403 1.312950e-15
## HERVH_4p16.1a       33.990936      -3.005839 0.4187236  61.34490 4.790307e-15
## L1FLnI_16q21r        3.273709      -5.240182 0.7319680  59.92167 9.870883e-15
## HERVE_12p13.31a     23.220900      -4.124862 0.6038672  55.13621 1.124600e-13
## ERV316A3_3q13.12c   44.079502      -4.022609 0.5983455  53.23314 2.962186e-13
## L1FLnI_9q21.31q      5.153661      -5.563388 0.8850657  44.95409 2.017081e-11
## HERVH_9q22.32       10.907592      -4.782847 0.7846673  43.55104 4.130452e-11
## HERVH_12p13.31b      6.053423      -3.821867 0.6384273  42.67286 6.470376e-11
## ERVLE_13q12.13b      5.919695      -3.321362 0.5610240  41.33452 1.282833e-10
## HML1_11q13.4         5.598164      -2.433965 0.4114061  40.65495 1.816262e-10
## MER41_Xp11.22        3.931869      -3.223323 0.5580736  39.86525 2.721011e-10
## HERVL40_4q31.3b     20.845708      -2.430624 0.4203000  39.11520 3.995224e-10
## HERVH_1q31.1a        7.566545      -4.794322 0.8188462  39.16794 3.888737e-10
## L1FLnI_12q14.1m     10.306818       7.105848 0.9582216  39.06100 4.107684e-10
##                           padj
## L1FLnI_3p22.1e    2.798786e-43
## L1FLnI_5q15xa     1.745966e-26
## L1FLnI_9p22.1c    7.117667e-23
## L1FLnI_6q16.1c    1.377176e-17
## L1FLnI_7q22.3h    8.549506e-17
## HERVH_16p13.2e    5.878367e-13
## L1FLnI_11p15.4m   1.505204e-12
## HERVH_4p16.1a     4.805276e-12
## L1FLnI_16q21r     8.801537e-12
## HERVE_12p13.31a   8.204470e-11
## ERV316A3_3q13.12c 1.980962e-10
## L1FLnI_9q21.31q   1.156219e-08
## HERVH_9q22.32     2.209792e-08
## HERVH_12p13.31b   3.054398e-08
## ERVLE_13q12.13b   5.719299e-08
## HML1_11q13.4      7.671316e-08
## MER41_Xp11.22     1.091806e-07
## HERVL40_4q31.3b   1.433225e-07
## HERVH_1q31.1a     1.433225e-07
## L1FLnI_12q14.1m   1.433225e-07
batch_hervs <- rownames(herv_res_batch)

Unsupervised Clustering of the samples

breaklist <- seq(-3, 3, by = 0.25)
herv_res_heat <- heattable(DESeq.herv.batch, herv_res_batch, select_string = "^(ERV|HERV|LTR|MER|HML).+",min_fc = 2, max_p = 0.01, select_genes = 50, exclude_genes = "")
pheatmap(herv_res_heat, scale="none",main = "Hirachical Clustering Heatmap of Met Samples",fontsize = 20, size = 35, color = inferno(25), show_rownames = TRUE,cluster_cols = TRUE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = batch_meta[,"Batch", drop = FALSE])

Primary vs. Mets

Run Primary vs. Metastatic

#create the object
DESeq.herv.pm <- DESeqDataSetFromMatrix(countData = as.matrix(all_hervs[,rownames(pm_meta)]),
                                   colData = pm_meta,
                                   design = ~ Status) 
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.herv.pm$Status <- relevel(DESeq.herv.pm$Status, ref = "Primary")

Process count info

#size factor
DESeq.herv.pm <- estimateSizeFactors(DESeq.herv.pm)
#normalize & log-sized
DESeq.herv.pm_norm <- counts(DESeq.herv.pm, normalized = TRUE)
boxplot(log2(DESeq.herv.pm_norm+1), notch=FALSE, main = "Size-factor-normalized read counts", ylab="log2(read counts)", cex = .6, las = 2)

assay(DESeq.herv.pm,"log.norm.counts") <- log2(DESeq.herv.pm_norm+1)
#SD and PCA plot
msd.herv.pm <- vsn::meanSdPlot(DESeq.herv.pm@assays@data@listData$log.norm.counts, ranks=FALSE, plot = FALSE)
msd.herv.pm$gg + ggtitle("Sequencing depth normalized log2(read counts)") + ylab("standard deviation")

DESeq.herv.pm.trans <- DESeq(DESeq.herv.pm)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 384 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
herv.pm.vst <- assay(varianceStabilizingTransformation(DESeq.herv.pm.trans))
herv.pm.pca <- pca(herv.pm.vst, metadata = colData(DESeq.herv.pm.trans), removeVar = 0.1)
## -- removing the lower 10% of variables based on variance

PCA Visualize

screeplot(herv.pm.pca, axisLabSize = 18, titleLabSize = 22) + ggtitle("SCREE Plot for HERVs, Primary + Metastatic")
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

biplot(herv.pm.pca, lab = NULL, colby = "Cluster",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by Cluster")

biplot(herv.pm.pca, lab = NULL, colby = "Status",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by Status")

biplot(herv.pm.pca, lab = NULL, colby = "toMet",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by Metastasis Development")

biplot(herv.pm.pca, lab = NULL, colby = "Batch",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by Batch")

biplot(herv.pm.pca, lab = NULL, colby = "GNAQ_site", legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by GNAQ Mutation Site")

biplot(herv.pm.pca, lab = NULL, colby = "BAP1", legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by BAP1 Status")

biplot(herv.pm.pca, lab = NULL, colby = "BAP1_type",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by BAP1 Mutation Type")

biplot(herv.pm.pca, lab = NULL, colby = "Sex", legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by Sex")

biplot(herv.pm.pca, lab = NULL, colby = "SF3B1",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by SF3B1 Status")

options(ggrepel.max.overlaps = Inf)
biplot(herv.pm.pca, lab = herv.pm.pca$metadata$Sample, colby = "Cell_line",legendPosition = 'right') + ggtitle("PCA, HERVs Primary + Mets\nColor by Cell Line")
## Warning: ggrepel: 94 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#ggsave("all HERVs Cell Line.png")

PCA Loading Plot

plotloadings(herv.pm.pca,
    components = getComponents(herv.pm.pca, c(1,2)),
    rangeRetain = 0.1,
    labSize = 4.0,
    title = 'Loadings plot for primary + metastatic HERVs',
    subtitle = 'PC1, PC2',
    caption = 'Top 1% variables',
    shape = 24,
    col = c('limegreen', 'black', 'red3'),
    drawConnectors = TRUE)
## -- variables retained:
## MER101_16p12.2a, HERVH_13q22.3, HERVH_4p15.1a, PABLB_5q12.2, HERVH_10q23.32, HERVI_9q22.1, HERVH_6q24.1b, PRIMA41_13q22.3, MER4_Xq21.33c, HML6_3p22.3, ERVL_9q31.1, MER4B_9q31.1b, HERVL_21q22.2, ERVLE_2q24.3b, MER61_4q31.21b, HUERSP3B_Xq13.2b, ERV316A3_14q21.3c, HML5_12q12b, HERV9_6p21.2, HERVL_12q12b, HERVL_2q22.3a, HERVH_2q33.1a, MER41_15q13.3, HERVW_8q21.13, HERVIP10FH_5q32, MER4B_5q14.1, MER41_12q12, HUERSP3_6q22.31, MER41_5q15a, LTR25_11p14.1, ERVLB4_7q31.1c, ERVLB4_6q23.2a, ERVLE_15q12b, MER41_6q22.31b, HERVEA_4q22.1, MER61_16q12.1, HERVEA_Xp11.21, HERVFRD_5q11.2, HERVL_1q25.2, ERVLE_11q14.2c, HARLEQUIN_1q42.13a, MER41_15q21.2a, ERV316A3_14q21.3b, MER61_Xq11.1, HML3_7q36.3

   #max.overlap = Inf
#DESeq.herv.pm$Cluster <- relevel(DESeq.herv.pm$Cluster, ref = "Metastatic")
DESeq.herv.pm <- DESeq(DESeq.herv.pm,parallel = T,test = "LRT", reduced = ~1)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates: 6 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 6 workers
## -- replacing outliers and refitting for 401 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

DE table on HERV

#resultsNames(DESeq.herv.pm)
#"Intercept"                    "Status_Primary_vs_Metastatic"
#Mets vs. All
herv_res_pm<-restable(min_fc = 2, max_p = 0.01, n_top_genes = 20, DESeq.herv.pm, cont = c("Status","Metastatic","Primary"))
##                       baseMean log2FoldChange     lfcSE     stat       pvalue
## HML2_3q13.2           2.767446       5.240929 0.4065786 185.9204 2.470963e-42
## ERV316A3_19q13.2     12.567693       4.043621 0.3113588 179.5517 6.071500e-41
## HML2_11q23.3          8.365531       6.028128 0.4690936 178.1348 1.237872e-40
## HML2_19q11           20.850587       8.142469 0.5922696 174.7286 6.862310e-40
## HERVL_3q21.2         31.807864       5.196241 0.4022239 170.2177 6.631538e-39
## HERVIP10FH_17q21.32   6.563468       2.413798 0.1989844 162.2533 3.642169e-37
## HML2_3q27.2           8.151993       5.173398 0.4267370 159.7918 1.256427e-36
## HML3_7q36.3         183.126780      13.328096 0.7746528 148.7669 3.224660e-34
## HERVS71_11q13.2      29.115358       6.742577 0.5465766 142.4691 7.679163e-33
## ERVLE_1p34.3c         2.953931       4.629152 0.4146446 141.2407 1.425323e-32
## HML2_7q22.2         927.127296       3.950818 0.3492430 137.6423 8.725988e-32
## HML6_14q24.2         13.542162       5.113347 0.4463552 136.6914 1.408576e-31
## HERVI_16q22.2        79.982709      10.089581 0.7779156 134.5523 4.136857e-31
## ERVLE_17p11.2c      231.320369      -2.447170 0.1857741 133.0566 8.787103e-31
## LTR46_1p36.13         4.428574       5.966823 0.5536398 128.1465 1.042544e-29
## HERVL_21q22.2        10.671705       4.287555 0.3943519 128.1117 1.061013e-29
## PRIMA4_12p11.21a      7.244193       2.740667 0.2617433 121.6670 2.730203e-28
## MER4_11p15.4a         4.748947       2.391660 0.2390804 110.9362 6.110597e-26
## HML3_1p21.3           7.021049       3.324386 0.3386361 107.5918 3.302422e-25
## HERVE_6q15            5.955854       6.228608 0.6254671 107.1718 4.081993e-25
##                             padj
## HML2_3q13.2         9.300704e-39
## ERV316A3_19q13.2    1.142656e-37
## HML2_11q23.3        1.553116e-37
## HML2_19q11          6.457433e-37
## HERVL_3q21.2        4.992222e-36
## HERVIP10FH_17q21.32 2.284854e-34
## HML2_3q27.2         6.755989e-34
## HML3_7q36.3         1.517202e-31
## HERVS71_11q13.2     3.211597e-30
## ERVLE_1p34.3c       5.364916e-30
## HML2_7q22.2         2.985875e-29
## HML6_14q24.2        4.418234e-29
## HERVI_16q22.2       1.112224e-28
## ERVLE_17p11.2c      2.204977e-28
## LTR46_1p36.13       2.218697e-27
## HERVL_21q22.2       2.218697e-27
## PRIMA4_12p11.21a    5.408675e-26
## MER4_11p15.4a       1.150014e-23
## HML3_1p21.3         5.919198e-23
## HERVE_6q15          6.983918e-23
#Mets vs. All, excluding the batch hervs
herv_res_pm_filt <- restable(min_fc = 2, max_p = 0.01, n_top_genes = 20, DESeq.herv.pm, cont = c("Status","Metastatic","Primary"), exclude_gene = batch_hervs)
## Warning in if (exclude_gene != "") {: the condition has length > 1 and only the
## first element will be used
##                       baseMean log2FoldChange     lfcSE     stat       pvalue
## HML2_3q13.2           2.767446       5.240929 0.4065786 185.9204 2.470963e-42
## ERV316A3_19q13.2     12.567693       4.043621 0.3113588 179.5517 6.071500e-41
## HML2_11q23.3          8.365531       6.028128 0.4690936 178.1348 1.237872e-40
## HML2_19q11           20.850587       8.142469 0.5922696 174.7286 6.862310e-40
## HERVL_3q21.2         31.807864       5.196241 0.4022239 170.2177 6.631538e-39
## HERVIP10FH_17q21.32   6.563468       2.413798 0.1989844 162.2533 3.642169e-37
## HML2_3q27.2           8.151993       5.173398 0.4267370 159.7918 1.256427e-36
## HML3_7q36.3         183.126780      13.328096 0.7746528 148.7669 3.224660e-34
## HERVS71_11q13.2      29.115358       6.742577 0.5465766 142.4691 7.679163e-33
## ERVLE_1p34.3c         2.953931       4.629152 0.4146446 141.2407 1.425323e-32
## HML2_7q22.2         927.127296       3.950818 0.3492430 137.6423 8.725988e-32
## HERVI_16q22.2        79.982709      10.089581 0.7779156 134.5523 4.136857e-31
## ERVLE_17p11.2c      231.320369      -2.447170 0.1857741 133.0566 8.787103e-31
## LTR46_1p36.13         4.428574       5.966823 0.5536398 128.1465 1.042544e-29
## HERVL_21q22.2        10.671705       4.287555 0.3943519 128.1117 1.061013e-29
## PRIMA4_12p11.21a      7.244193       2.740667 0.2617433 121.6670 2.730203e-28
## MER4_11p15.4a         4.748947       2.391660 0.2390804 110.9362 6.110597e-26
## HML3_1p21.3           7.021049       3.324386 0.3386361 107.5918 3.302422e-25
## HERVE_6q15            5.955854       6.228608 0.6254671 107.1718 4.081993e-25
## MER34B_1q23.3a        3.711454       4.097588 0.4150451 106.9418 4.584321e-25
##                             padj
## HML2_3q13.2         9.300704e-39
## ERV316A3_19q13.2    1.142656e-37
## HML2_11q23.3        1.553116e-37
## HML2_19q11          6.457433e-37
## HERVL_3q21.2        4.992222e-36
## HERVIP10FH_17q21.32 2.284854e-34
## HML2_3q27.2         6.755989e-34
## HML3_7q36.3         1.517202e-31
## HERVS71_11q13.2     3.211597e-30
## ERVLE_1p34.3c       5.364916e-30
## HML2_7q22.2         2.985875e-29
## HERVI_16q22.2       1.112224e-28
## ERVLE_17p11.2c      2.204977e-28
## LTR46_1p36.13       2.218697e-27
## HERVL_21q22.2       2.218697e-27
## PRIMA4_12p11.21a    5.408675e-26
## MER4_11p15.4a       1.150014e-23
## HML3_1p21.3         5.919198e-23
## HERVE_6q15          6.983918e-23
## MER34B_1q23.3a      7.502341e-23
print(paste0(nrow(herv_res_pm_filt), " of HERVs were reported to be significant."))
## [1] "845 of HERVs were reported to be significant."
write.csv(herv_res_pm_filt, "herv_res_pm_filt.csv")

QQ Plot

herv_res_pm_q <- results(DESeq.herv.pm) %>% as.data.frame()
Norm_quantile<- sort(-log10(seq(1, 1/nrow(herv_res_pm_q), length.out = nrow(herv_res_pm_q))), decreasing = TRUE)
herv_res_pm_q <- herv_res_pm_q[order(herv_res_pm_q$padj, decreasing  = FALSE),]
herv_res_pm_q["Normal_Quantile"] <- Norm_quantile
ggplot(herv_res_pm_q, aes(x = Normal_Quantile, y =-log10(padj))) + geom_point()+geom_abline(intercept = 0, slope = 1, color="red") + xlab("Negative Log Normal Quantile (Expected)")+ylab("Negative Log 10 P Values (Observed)")+ggtitle("QQ Plot for HERV Primary vs Mets\nDesign=RECIST")
## Warning: Removed 2588 rows containing missing values (geom_point).

Heatmap

#pm_heat <- heattable(DESeq.herv.pm, herv_res_pm, select_string = "^(ERV|HERV|LTR|MER|HML).+",min_fc =2, max_p = 0.01) 
pm_heat_filt <- heattable(DESeq.herv.pm, herv_res_pm_filt, select_string = "^(ERV|HERV|LTR|MER|HML).+",min_fc =2, max_p = 0.01,exclude_genes = batch_hervs) 
pm_heat_clustering <- heattable(DESeq.herv.pm, herv_res_pm_filt, select_string = "^(ERV|HERV|LTR|MER|HML).+",min_fc =2, max_p = 0.01,select_genes = 50, exclude_genes = batch_hervs)
breaklist <- seq(-3, 3, by = 0.25)
#pheatmap(pm_heat, scale="row",main = "DE HERVs, All Primary vs. Pre-tx Metastatic\nCutoff p=0.01,Top 49 HERVs",fontsize = 20, size = 35, color = inferno(25), show_rownames = TRUE,cluster_cols = FALSE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE], gaps_col = 80)

pheatmap(pm_heat_clustering, scale="row",main = "DE HERVs Clustering, Primary vs. Pre-tx Metastatic\nCutoff p= 0.01,Top 40 HERVs",fontsize = 20, size = 35, color = inferno(25), show_rownames = TRUE,cluster_cols = TRUE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE], gaps_col = 80)

pheatmap(pm_heat_filt, scale="row",main = "DE HERVs, All Primary vs. Pre-tx Metastatic\nCutoff p= 0.01,Top 40 HERVs Excluding Batch",fontsize = 20, size = 35, color = inferno(25), show_rownames = TRUE,cluster_cols = FALSE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE], gaps_col = 80)

png("DE HERVs Design Primary vs Metastatic.png", res = 1200, unit = "in", width = 12, height = 12)

pheatmap(pm_heat_filt, scale="row",main = "DE HERVs, All Primary vs. Pre-tx Metastatic\nCutoff p= 0.01,Top 40 HERVs Excluding Batch",fontsize = 20, size = 35, color = inferno(25), show_rownames = FALSE,cluster_cols = FALSE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE], gaps_col = 80)
png("DE HERVs Design Primary vs Metastatic.Clustered.png", res = 1200, unit = "in", width = 12, height = 12)

pheatmap(pm_heat_clustering, scale="row",main = "DE HERVs Clustering, Primary vs. Pre-tx Metastatic\nCutoff p= 0.01,Top 40 HERVs",fontsize = 20, size = 35, color = inferno(25), show_rownames = FALSE,cluster_cols = TRUE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE], gaps_col = 80)

Volcano

herv_res_all <- results(DESeq.herv.pm) %>% data.frame()
herv_res_all$hervs <- rownames(herv_res_all)
herv_res_all$hervs[(which(!(rownames(herv_res_all) %in% rownames(pm_heat_filt))))] <- ""
EnhancedVolcano(herv_res_all, lab = herv_res_all$hervs, x = 'log2FoldChange', y = 'padj', pCutoff = 0.01,FCcutoff = log2(2),title = "HERV Vocalno, Primary vs. Metastatic, p Cutoff = 0.01, Showing top 40 HERVs",titleLabSize = 30, labSize = 10,legendLabSize = 15,max.overlaps = Inf, drawConnectors = TRUE,arrowheads = FALSE,colAlpha = 1/2,col = c("black", "forestgreen", "yellow", "red2"),pointSize = 4)+theme( plot.title=element_text(size=45,face="bold"),axis.text=element_text(size=25),axis.title=element_text(size=25,face="bold"))

#ggsave("Valcano HERVs_PrimaryvsMets.png")

HERVs of Interest (17 Profile HERVs from Primary paper)

herv_profile <- c('ERVLE_17p11.2c','LTR46_Xq11.1','HERVH_6q24.1b','HERVH_3q21.3a','MER4B_3q21.2','HERVH_10q23.32','HML3_19q13.2','HERVI_9q22.1','MER4_1p21.3','HML3_3q21.2','ERVLE_5q13.2a','HERV3_19q13.42a','HERVE_Xp11.23','ERV316A3_6p21.33c','MER101_19q13.2c','ERVLE_2p13.2a','MER4B_12q22')
herv_res_17 <- herv_res_all[herv_profile,,drop = FALSE]
herv_res_17_sig <- subset(herv_res_17, padj <= 0.01)
print(paste0("There are ", nrow(herv_res_17_sig), " significantly DE HERVs between primary vs. Mets. As listed:" ))
## [1] "There are 9 significantly DE HERVs between primary vs. Mets. As listed:"
rownames(herv_res_17_sig)
## [1] "ERVLE_17p11.2c" "LTR46_Xq11.1"   "HERVH_3q21.3a"  "MER4B_3q21.2"  
## [5] "HERVH_10q23.32" "HERVI_9q22.1"   "ERVLE_5q13.2a"  "ERVLE_2p13.2a" 
## [9] "MER4B_12q22"
herv_res_17
##                      baseMean log2FoldChange     lfcSE         stat
## ERVLE_17p11.2c     231.320369    -2.44716980 0.1857741 133.05658977
## LTR46_Xq11.1      2084.308500    -1.59259318 0.1801353  65.81463790
## HERVH_6q24.1b       43.852405    -1.50449530 0.6871596   4.09095961
## HERVH_3q21.3a        2.978850    -1.99800006 0.5092303  13.76652668
## MER4B_3q21.2        74.287944    -0.99025872 0.2226502  17.96098383
## HERVH_10q23.32     127.738804    -3.60068196 0.4419191  44.21090281
## HML3_19q13.2       239.474843    -0.69242497 0.3171340   4.45184860
## HERVI_9q22.1       113.564864    -2.83310368 0.6061039  15.85432849
## MER4_1p21.3         56.698813    -0.63768465 0.2885318   4.61812195
## HML3_3q21.2          5.436287    -0.07509717 0.3636110   0.04305649
## ERVLE_5q13.2a        2.623870     1.42672476 0.2847621  27.90214455
## HERV3_19q13.42a      1.793846     0.73904242 0.3738196   4.61863662
## HERVE_Xp11.23       95.150772     0.27741746 0.2307454   1.49198307
## ERV316A3_6p21.33c 1553.189091     0.28266305 0.3919469   0.53288733
## MER101_19q13.2c      1.077808     1.21848724 0.7436845   3.33255678
## ERVLE_2p13.2a        3.594577     0.98863057 0.2876769  13.13696792
## MER4B_12q22          1.780568     2.51039744 0.4689431  33.06344866
##                         pvalue         padj          hervs
## ERVLE_17p11.2c    8.787103e-31 2.204977e-28 ERVLE_17p11.2c
## LTR46_Xq11.1      4.953894e-16 1.528398e-14               
## HERVH_6q24.1b     4.311316e-02 6.991725e-02               
## HERVH_3q21.3a     2.069917e-04 6.252943e-04               
## MER4B_3q21.2      2.254795e-05 8.461664e-05               
## HERVH_10q23.32    2.948346e-11 3.545551e-10               
## HML3_19q13.2      3.486349e-02 5.814186e-02               
## HERVI_9q22.1      6.840988e-05 2.335926e-04               
## MER4_1p21.3       3.163586e-02 5.344586e-02               
## HML3_3q21.2       8.356189e-01 9.101647e-01               
## ERVLE_5q13.2a     1.276083e-07 7.435255e-07               
## HERV3_19q13.42a   3.162637e-02 5.344586e-02               
## HERVE_Xp11.23     2.219090e-01 2.941076e-01               
## ERV316A3_6p21.33c 4.653955e-01 5.503119e-01               
## MER101_19q13.2c   6.792121e-02 1.039672e-01               
## ERVLE_2p13.2a     2.895260e-04 8.395808e-04               
## MER4B_12q22       8.919992e-09 6.583304e-08
heat_17 <- heattable(DESeq.herv.pm, herv_res_17, select_string = ".+",min_fc = 0, max_p = 1,select_genes = 17, exclude_genes = "")
herv_res_17$Significance <- ifelse(herv_res_17$padj <= 0.01, "*","NS")
herv_res_17 <- herv_res_17[order(herv_res_17$padj),]
pheatmap(heat_17, scale="row",main = "Selected HERVs\nPrimary vs. Pre-tx Metastatic",fontsize = 20, size = 35, color = inferno(25), show_rownames = TRUE,cluster_cols = FALSE, cluster_rows = FALSE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Cluster", drop = FALSE], annotation_row =herv_res_17[,"Significance", drop = FALSE],  gaps_col = 80)

Invidual HERV Plotting

Function to create error bars

data_sum <- function(df, variable_row, grouping_row){
  g_factors <- as.factor(unique(df[,grouping_row]))
  sum_table <- data.frame(matrix(nrow = length(g_factors), ncol = 2))
  colnames(sum_table) <- c(variable_row, "sd")
  rownames(sum_table) <- g_factors
  for(i in 1:length(g_factors)){
    sub_mean <- mean(df[which(df[,grouping_row] == g_factors[i]),variable_row])
    sub_sd <- sd(df[which(df[,grouping_row] == g_factors[i]),variable_row])
    sum_table[i,c(1:2)] <- c(sub_mean,sub_sd)
  }
  sum_table[grouping_row] <- g_factors
  return(sum_table)
}

Significant count graphing

HML2_19q11 <- plotCounts(DESeq.herv.pm, gene = "HML2_19q11", intgroup = "Status", normalized = TRUE,returnData = TRUE)
HML2_summary<- data_sum(HML2_19q11, variable_row="count", grouping_row="Status")
ggplot(HML2_19q11, aes(x = Status, y = count)) +geom_point(aes(size = 1.5)) +  ggtitle("Normalized Counts for HML2_19q11c\nSeparated by Status")+geom_errorbar(aes(x = Status, ymin = count-sd, ymax = count+sd),data = HML2_summary, width = 0.08, col = "grey50") + geom_errorbar(aes(x = Status, ymin = count, ymax = count),data = HML2_summary, width = 0.08, col = "red2") +  theme( plot.title=element_text(size=16,face="bold"),axis.text=element_text(size=18),axis.title=element_text(size=20,face="bold"))

#ggsave("ERV316A3_3q13.12c Counts.png")
ERVLE_17p11.2c <- plotCounts(DESeq.herv.pm, gene = "ERVLE_17p11.2c", intgroup = "Status", normalized = TRUE,returnData = TRUE)
ERVLE_summary<- data_sum(ERVLE_17p11.2c, variable_row="count", grouping_row="Status")
ggplot(ERVLE_17p11.2c, aes(x = Status, y = count)) +geom_point(aes(size = 1.5)) +  ggtitle("Normalized Counts for ERVLE_17p11.2c\nSeparated by Status")+geom_errorbar(aes(x = Status, ymin = count-sd, ymax = count+sd),data = ERVLE_summary, width = 0.08, col = "grey50") + geom_errorbar(aes(x = Status, ymin = count, ymax = count),data = ERVLE_summary, width = 0.08, col = "red2") +  theme( plot.title=element_text(size=16,face="bold"),axis.text=element_text(size=18),axis.title=element_text(size=20,face="bold"))

Batch Genes

load("/Users/phoebefei/Library/CloudStorage/OneDrive-med.cornell.edu/WCM/Metastatic UM Prj/primary_UM/analysis2/03-gene_data.Rdata")
#genes.annot
DESeq.gene.batch <- DESeqDataSetFromMatrix(countData = as.matrix(all_genes[,rownames(batch_meta)]),
                                   colData = batch_meta,
                                   design = ~ Batch) 
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#size factor
DESeq.gene.batch <- estimateSizeFactors(DESeq.gene.batch)
#normalize & log-sized
DESeq.gene.batch_norm <- counts(DESeq.gene.batch, normalized = TRUE)
boxplot(log2(DESeq.gene.batch_norm+1), notch=FALSE, main = "Size-factor-normalized read counts", ylab="log2(read counts)", cex = .6, las = 2)

assay(DESeq.gene.batch,"log.norm.counts") <- log2(DESeq.gene.batch_norm+1)
DESeq.gene.batch <- DESeq(DESeq.gene.batch,parallel = T,test = "LRT", reduced = ~1)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates: 6 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 6 workers
## -- replacing outliers and refitting for 479 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
#resultsNames(DESeq.gene.batch)
#[1] "Intercept"              "Batch_Nilsson_vs_MSKCC"
gene_res_batch<-restable(min_fc = 2, max_p = 0.01, n_top_genes = 20, DESeq.gene.batch, cont = c("Batch","Nilsson","MSKCC"))
##                    baseMean log2FoldChange     lfcSE     stat       pvalue
## ENSG00000228657.1 16.417723      -7.168366 0.5457179 235.0056 4.827931e-53
## ENSG00000176183.8 10.088424      -6.453679 0.5295823 206.7366 7.077527e-47
## ENSG00000239622.1 18.475332      -8.135822 0.6198006 199.7079 2.418614e-45
## ENSG00000213176.4 15.145637      -7.849836 0.6074115 196.3232 1.325015e-44
## ENSG00000258401.1  6.242856      -6.336826 0.5341218 180.6060 3.573605e-41
## ENSG00000226131.1  8.246536      -4.393773 0.3933950 175.7147 4.179513e-40
## ENSG00000228861.5  7.212143      -6.308285 0.5467297 171.1407 4.169065e-39
## ENSG00000234751.1  6.810131      -6.219571 0.5445855 167.3931 2.745150e-38
## ENSG00000214846.4 10.839552      -7.367980 0.6424622 152.7831 4.272387e-35
## ENSG00000266989.1 14.971050      -7.830680 0.6955837 142.6662 6.953590e-33
## ENSG00000233990.1  7.688570      -6.872095 0.6324430 138.8215 4.818708e-32
## ENSG00000237793.3 27.989991      -7.084070 0.6576947 138.5502 5.524162e-32
## ENSG00000233873.1 46.357520      -5.222763 0.4800912 136.2956 1.719323e-31
## ENSG00000244331.1  4.591673      -5.401614 0.5406805 131.5076 1.917378e-30
## ENSG00000219863.5  5.380309      -5.132413 0.5336311 127.3175 1.583117e-29
## ENSG00000226915.1 13.711504      -4.590263 0.4554291 125.3578 4.249809e-29
## ENSG00000240459.2  4.186277      -5.629961 0.5654271 123.1292 1.306551e-28
## ENSG00000233741.1  7.668827      -5.046599 0.5369500 117.8588 1.861857e-27
## ENSG00000230807.1  4.621450      -5.154276 0.5529784 114.1847 1.187175e-26
## ENSG00000271207.1 25.336302      -6.177779 0.6299226 112.2933 3.081820e-26
##                           padj
## ENSG00000228657.1 5.473908e-49
## ENSG00000176183.8 4.012250e-43
## ENSG00000239622.1 9.140747e-42
## ENSG00000213176.4 3.755754e-41
## ENSG00000258401.1 8.103507e-38
## ENSG00000226131.1 7.897887e-37
## ENSG00000228861.5 6.752694e-36
## ENSG00000234751.1 3.890563e-35
## ENSG00000214846.4 5.382259e-32
## ENSG00000266989.1 7.883980e-30
## ENSG00000233990.1 4.966774e-29
## ENSG00000237793.3 5.219412e-29
## ENSG00000233873.1 1.499515e-28
## ENSG00000244331.1 1.552802e-27
## ENSG00000219863.5 1.196626e-26
## ENSG00000226915.1 3.011521e-26
## ENSG00000240459.2 8.713928e-26
## ENSG00000233741.1 1.172763e-24
## ENSG00000230807.1 7.084310e-24
## ENSG00000271207.1 1.747084e-23
batch_genes <- rownames(gene_res_batch)

Primary vs. Mets Genes

#create the object
DESeq.gene.pm <- DESeqDataSetFromMatrix(countData = as.matrix(all_genes[,rownames(pm_meta)]),
                                   colData = pm_meta,
                                   design = ~ Status) 
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.gene.pm$Status <- relevel(DESeq.gene.pm$Status, ref = "Primary")

Process count info

#size factor
DESeq.gene.pm <- estimateSizeFactors(DESeq.gene.pm)
#normalize & log-sized
DESeq.gene.pm_norm <- counts(DESeq.gene.pm, normalized = TRUE)
boxplot(log2(DESeq.gene.pm_norm+1), notch=FALSE, main = "Size-factor-normalized read counts", ylab="log2(read counts)", cex = .6, las = 2)

assay(DESeq.gene.pm,"log.norm.counts") <- log2(DESeq.gene.pm_norm+1)
DESeq.gene.pm$Age <- as.numeric(DESeq.gene.pm$Age)
#SD and PCA plot
msd.gene.pm <- vsn::meanSdPlot(DESeq.gene.pm@assays@data@listData$log.norm.counts, ranks=FALSE, plot = FALSE)
msd.gene.pm$gg + ggtitle("Sequencing depth normalized log2(read counts)") + ylab("standard deviation")

DESeq.gene.pm.trans <- DESeq(DESeq.gene.pm)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 685 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
gene.pm.vst <- assay(varianceStabilizingTransformation(DESeq.gene.pm.trans))
gene.pm.pca <- pca(gene.pm.vst, metadata = colData(DESeq.gene.pm.trans), removeVar = 0.1)
## -- removing the lower 10% of variables based on variance

PCA Visualize

screeplot(gene.pm.pca, axisLabSize = 18, titleLabSize = 22) + ggtitle("SCREE Plot for Genes, Primary + Metastatic")
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

biplot(gene.pm.pca, lab = NULL, colby = "Cluster",legendPosition = 'right') + ggtitle("PCA, Genes Primary + Mets\nColor by Cluster")

biplot(gene.pm.pca, lab = NULL, colby = "Status",legendPosition = 'right') + ggtitle("PCA, Genes Primary + Mets\nColor by Status")

biplot(gene.pm.pca, lab = NULL, colby = "toMet",legendPosition = 'right') + ggtitle("PCA, Genes Primary + Mets\nColor by Metastasis Development")

biplot(gene.pm.pca, lab = NULL, colby = "Batch",legendPosition = 'right') + ggtitle("PCA, Genes Primary + Mets\nColor by Batch")

biplot(gene.pm.pca, lab = NULL, colby = "Age", legendPosition = 'right') + ggtitle("PCA, Genes Primary + Mets\nColor by Age")

biplot(gene.pm.pca, lab = NULL, colby = "Sex", legendPosition = 'right') + ggtitle("PCA, Genes Primary + Mets\nColor by Sex")

PCA Loading Plot

plotloadings(gene.pm.pca,
             components = getComponents(gene.pm.pca, c(1,2)),
             rangeRetain = 0.1,
             labSize = 4.0,
             title = 'Loadings plot for primary + metastatic HERVs',
             subtitle = 'PC1, PC2',
             caption = 'Top 1% variables',
             shape = 24,
             col = c('limegreen', 'black', 'red3'),
             drawConnectors = TRUE)
## -- variables retained:
## ENSG00000227063.5, ENSG00000235174.1, ENSG00000174977.8, ENSG00000280614.1, ENSG00000280800.1, ENSG00000281181.1, ENSG00000225840.2, ENSG00000234857.2, ENSG00000261641.2, ENSG00000267469.1, ENSG00000267815.1, ENSG00000203812.2, ENSG00000225972.1, ENSG00000269378.1, ENSG00000231955.1, ENSG00000271959.1, ENSG00000279518.1, ENSG00000251634.2, ENSG00000179978.11, ENSG00000281183.1, ENSG00000256973.1, ENSG00000281026.1, ENSG00000255423.1, ENSG00000272037.1, ENSG00000243107.1, ENSG00000229152.2, ENSG00000272316.1

#max.overlap = Inf
DESeq.gene.pm <- DESeq(DESeq.gene.pm,parallel = T,test = "LRT", reduced = ~1)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates: 6 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 6 workers
## -- replacing outliers and refitting for 698 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

DE table on HERV

#resultsNames(DESeq.gene.pm)
#"Intercept"                    "Status_Primary_vs_Metastatic"
#Mets vs. All
gene_res_pm<-restable(min_fc = 2, max_p = 0.01, n_top_genes = 20, DESeq.gene.pm, cont = c("Status","Metastatic","Primary"), exclude_gene = batch_genes)
## Warning in if (exclude_gene != "") {: the condition has length > 1 and only the
## first element will be used
##                     baseMean log2FoldChange     lfcSE     stat pvalue padj
## ENSG00000189332.5   67.63813      -6.452664 0.2391559 1555.064      0    0
## ENSG00000214300.7  324.27627     -10.238325 0.3352725 1572.740      0    0
## ENSG00000230733.2  608.05457     -10.570670 0.3180922 1567.144      0    0
## ENSG00000243749.1  509.94560     -11.724037 0.3631247 1642.556      0    0
## ENSG00000244115.1  132.61647      -9.700952 0.3543101 1514.010      0    0
## ENSG00000254692.1 1458.64207     -13.239489 0.3664549 1799.684      0    0
## ENSG00000254873.1  573.75085      -8.794650 0.2197920 1622.318      0    0
## ENSG00000255455.2  182.89241     -10.249412 0.3507767 2202.089      0    0
## ENSG00000259623.1  271.30705     -10.816292 0.3551502 1950.021      0    0
## ENSG00000260772.1  482.53617     -11.645358 0.3631184 1623.774      0    0
## ENSG00000263244.2  661.22132     -12.101390 0.3544623 2507.753      0    0
## ENSG00000268713.1   95.39947      -9.147777 0.3471234 1700.779      0    0
## ENSG00000269243.1  589.24981      -9.923997 0.2830857 1730.878      0    0
## ENSG00000269604.1 1844.55160     -11.086664 0.2542043 2099.806      0    0
## ENSG00000269693.1  258.32903     -10.745414 0.3481672 2938.435      0    0
## ENSG00000270696.1  130.35486      -9.678766 0.3483114 2046.407      0    0
## ENSG00000274012.1 2456.59885     -13.990004 0.3750256 1536.887      0    0
## ENSG00000277801.1  285.99328     -10.502504 0.3396584 2454.753      0    0
## ENSG00000279033.1  425.71318     -10.361285 0.3227058 1823.566      0    0
## ENSG00000279095.1   87.90958      -9.189557 0.3517177 1622.531      0    0

Volcano

pm_gene <- heattable(DESeq.gene.pm, gene_res_pm, select_string = ".+",min_fc =2, max_p = 0.01, select_genes = 40, exclude_gene=batch_genes) 
gene_res_all <- results(DESeq.gene.pm) %>% data.frame()
gene_res_all$gene_names <- genes.annot[rownames(gene_res_all),"gene_name"]
gene_res_all$gene_names[which(!(rownames(gene_res_all) %in% rownames(pm_gene)))] <- ""
EnhancedVolcano(gene_res_all, lab = gene_res_all$gene_names, x = 'log2FoldChange', y = 'padj', pCutoff = 0.01,FCcutoff = log2(2),title = "Gene Vocalno, Primary vs. Metastatic, p Cutoff = 0.01, Showing top 40 Genes",titleLabSize = 30, labSize = 10,legendLabSize = 15,max.overlaps = Inf, drawConnectors = TRUE,arrowheads = FALSE,colAlpha = 1/2,col = c("black", "forestgreen", "yellow", "red2"),pointSize = 4)+theme( plot.title=element_text(size=45,face="bold"),axis.text=element_text(size=25),axis.title=element_text(size=25,face="bold"))
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...

#ggsave("Valcano HERVs_PrimaryvsMets.png")

Heatmap

rownames(pm_gene) <- genes.annot[rownames(pm_gene), "gene_name"]
pheatmap(pm_gene, scale="row",main = "Primary vs. Pre-tx Metastatic Genes\n Excluding Batch, pCut = 0.01",fontsize = 20, size = 35, color = inferno(25), show_rownames = TRUE,cluster_cols = TRUE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE],  gaps_col = 80)

png("PM Gene Heatmap.png", res = 1200, unit = "in", width = 12, height = 12)
pheatmap(pm_gene, scale="row",main = "Primary vs. Pre-tx Metastatic Genes\n Excluding Batch, pCut = 0.01",fontsize = 20, size = 35, color = inferno(25), show_rownames = FALSE,cluster_cols = TRUE, cluster_rows = TRUE, breaks = breaklist, legend_labels = "Expression Z-score", annotation_col = pm_meta[,"Status", drop = FALSE],  gaps_col = 80)